R code for lecture 27

# load data and fit models from last time
wells <- read.csv("ecol 563/wells.txt")
out1 <- glm(cbind(y, n-y)~ land.use + sewer, data=wells, family=binomial)
wells$land.use2 <- factor(ifelse(wells$land.use %in% c('agri','undev'), 'rural', 
as.character(wells$land.use)))
wells$land.use3 <- factor(ifelse(wells$land.use2 %in% c('inst','recr','resL','resM','trans'), 
'mixed', as.character(wells$land.use2)))
wells$land.use4 <- factor(ifelse(wells$land.use3 %in% c('resH','comm','indus'), 
'high.use', as.character(wells$land.use3)))
out2d <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=binomial)
 
# predicted logits
predict(out2d)
# predicted probabilities
predict(out2d, type='response')
# predicted means
predict(out2d, type='response')*wells$n
ei <- predict(out2d, type='response')*wells$n
# expected number of contaminated and uncontaminated wells
Ei <- cbind(ei, wells$n-ei)
# observed number of contaminated and uncontaminated wells
Oi <- cbind(wells$y, wells$n-wells$y)
 
# Pearson statistic
sum((Oi-Ei)^2/Ei)
# p-value of Pearson statistic
1-pchisq(sum((Oi-Ei)^2/Ei),16)
# G-test statistic
sum(ifelse(Oi==0,0,2*Oi*log(Oi/Ei)))
# residual deviance = G-test statistic
deviance(out2d)
# degrees of freedom for G-test
df.residual(out2d)
# deviance goodness of fit
1-pchisq(deviance(out2d), df.residual(out2d))
 
# 35% of predicted counts are less than 5
sum(Ei<=5)/length(Ei)
 
# function for parametric bootstrap
sim.func <- function() {
obs <- sapply(1:nrow(wells), function(x) rbinom(1, prob=fitted(out2d)[x],
  size=wells$n[x]))
my.glm <- glm(cbind(obs,n-obs)~land.use4 + sewer, data=wells, family=binomial)
ei <- fitted(my.glm)*wells$n
Ei <- cbind(ei, wells$n-ei)
Oi <- cbind(obs, wells$n-obs)
# pearson test
pearson <- sum((Oi-Ei)^2/Ei)
# G-test
gtest <- sum(ifelse(Oi==0,0,2*Oi*log(Oi/Ei)))
# residual deviance
dev <- deviance(my.glm)
# return residual deviance
dev
}
 
# obtain 999 residual deviances from model
sims <- replicate(999, sim.func())
# append actual residual deviance to make 1000
sims <- c(sims, deviance(out2d))
# proportion of simulated values that exceed actual value
sum(sims>=deviance(out2d))/length(sims)
 
# if ratio of residual deviance to df > 1, overdispersion?
deviance(out2d)/df.residual(out2d)
sum((Oi-Ei)^2/Ei)
# if ratio of Pearson deviance to df > 1, overdispersion?
sum((Oi-Ei)^2/Ei)/df.residual(out2d)
 
# if overdispersion is real and cannot be fixed we can fit quasi-binomial
# quasi-binomial model
out2d.quasi <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=quasibinomial)
summary(out2d.quasi)
 
# same estimates as binomial but different standard errors
summary(out2d.quasi)$coefficients
summary(out2d)$coefficients
 
# the standard errors are obtained from the Pearson deviance
phi <- sum((Oi-Ei)^2/Ei)/df.residual(out2d)
phi
sqrt(phi)*summary(out2d)$coefficients[,2]
 
# add continuous predictors to the logistic regression model
out2g <- update(out2d, .~.+nitrate)
out2h <- update(out2d, .~.+chloride)
out2i <- update(out2d, .~.+chloride+nitrate)
AIC(out2g, out2h, out2i, out2d)
anova(out2d, out2g, test='Chisq')
 
# is there a significant interaction?
out2j<-update(out2d, .~. +land.use4:sewer)
anova(out2d,out2j, test='Chisq')
 
# obtaining odds ratios
# OR for contamination: mixed versus high use
exp(coef(out2d)[2])
# OR for contamination: high use versus mixed
exp(-coef(out2d)[2])
 
# OR for contamination: sewers present versus sewers absent
exp(coef(out2d)[4])
# OR for contamination: mixed versus rural
exp(coef(out2d)[2]-coef(out2d)[3])
 
# OR for contamination for a one unit change in nitrate
exp(coef(out2g)[5])
# OR for contamination for a three unit change in nitrate
exp(3*coef(out2g)[5])
 
# confidence intervals for the regression parameters
confint(out2d)
out.ci <- confint(out2d)
 
# 95% confidence intervals for the odds ratios
# mixed versus high and rural versus high
exp(out.ci[2,])
 
# OR for contamination: high use versus mixed
exp(-coef(out2d)[2])
# 95% confidence interval for that odds ratio
exp(-out.ci[2,])
 
# OR for contamination: high use versus rural
exp(-coef(out2d)[3])
# 95% confidence interval for that odds ratio
exp(-out.ci[3,])
 
# log odds ratio for contamination: mixed versus rural
est <- c(0,1,-1,0)%*%coef(out2d)
# standard error of that log odds ratio
se <- sqrt(c(0,1,-1,0)%*%vcov(out2d)%*%c(0,1,-1,0))
 
# 95% confidence interval for the log odds ratio
est + c(qnorm(.025),qnorm(.975))*se
# 95% confidence interval for the odds ratio: mixed versus rural
exp(est + c(qnorm(.025),qnorm(.975))*se)
# point estimate
exp(est)
R Code for Lecture 27 (Wednesday, November 28, 2012)